Problem Statement

Business Scenario • The data provided is from a Personal Loans campaign executed by MyBank. • 20000 customers were targeted with an offer of personal loans at a 10% interest rate. • 2512 of the 20000 customers responded, expressing their need for a personal loan; these customers are labelled Target = 1 and the remaining customers are labelled Target = 0.

The goal of this project is to build a machine learning model that predicts the target variable.

# Import the campaign dataset.
# stringsAsFactors is set explicitly: R >= 4.0 defaults it to FALSE, which
# would load the text columns (GENDER, OCCUPATION, AGE_BKT, ACC_TYPE, ...) as
# character instead of the factors that the summary output in this document
# reports level counts for.
raw_data <- read.csv(
  "Personal Loan Campaign-dataset.csv",
  header = TRUE,
  stringsAsFactors = TRUE
)
# Show the first six rows as a quick sanity check of the import
print(head(raw_data))
##   CUST_ID TARGET AGE GENDER    BALANCE OCCUPATION AGE_BKT SCR
## 1   C7927      0  27      M    3383.75   SELF-EMP   26-30 776
## 2   C6877      0  47      M  287489.04        SAL   46-50 324
## 3  C19922      0  40      M   18216.88   SELF-EMP   36-40 603
## 4   C8183      0  53      M   71720.48        SAL     >50 196
## 5  C12123      0  36      M 1671622.89       PROF   36-40 167
## 6    C257      0  42      F  521685.69       PROF   41-45 493
##   HOLDING_PERIOD ACC_TYPE ACC_OP_DATE LEN_OF_RLTN_IN_MNTH NO_OF_L_CR_TXNS
## 1             30       SA   3/23/2005                 146               7
## 2             28       SA    10-11-08                 104               8
## 3              2       SA   4/26/2012                  61              10
## 4             13       CA    07-04-08                 107              36
## 5             24       SA  12/29/2001                 185              20
## 6             26       SA    06-07-01                 192               5
##   NO_OF_L_DR_TXNS TOT_NO_OF_L_TXNS NO_OF_BR_CSH_WDL_DR_TXNS
## 1               3               10                        0
## 2               2               10                        0
## 3               5               15                        1
## 4              14               50                        4
## 5               1               21                        1
## 6               2                7                        1
##   NO_OF_ATM_DR_TXNS NO_OF_NET_DR_TXNS NO_OF_MOB_DR_TXNS NO_OF_CHQ_DR_TXNS
## 1                 1                 2                 0                 0
## 2                 1                 1                 0                 0
## 3                 1                 1                 0                 2
## 4                 2                 3                 1                 4
## 5                 0                 0                 0                 0
## 6                 1                 0                 0                 0
##   FLG_HAS_CC AMT_ATM_DR AMT_BR_CSH_WDL_DR AMT_CHQ_DR AMT_NET_DR AMT_MOB_DR
## 1          0      13100                 0          0     973557          0
## 2          0       6600                 0          0     799813          0
## 3          0      11200            561120      49320     997570          0
## 4          0      26100            673590      60780     741506      71388
## 5          0          0            808480          0          0          0
## 6          1      18500            379310          0          0          0
##   AMT_L_DR FLG_HAS_ANY_CHGS AMT_OTH_BK_ATM_USG_CHGS AMT_MIN_BAL_NMC_CHGS
## 1   986657                0                       0                    0
## 2   806413                1                       0                    0
## 3  1619210                1                       0                    0
## 4  1573364                0                       0                    0
## 5   808480                0                       0                    0
## 6   397810                0                       0                    0
##   NO_OF_IW_CHQ_BNC_TXNS NO_OF_OW_CHQ_BNC_TXNS AVG_AMT_PER_ATM_TXN
## 1                     0                     0               13100
## 2                     0                     0                6600
## 3                     0                     1               11200
## 4                     0                     0               13050
## 5                     0                     0                   0
## 6                     0                     0               18500
##   AVG_AMT_PER_CSH_WDL_TXN AVG_AMT_PER_CHQ_TXN AVG_AMT_PER_NET_TXN
## 1                     0.0                   0            486778.5
## 2                     0.0                   0            799813.0
## 3                561120.0               24660            997570.0
## 4                168397.5               15195            247168.7
## 5                808480.0                   0                 0.0
## 6                379310.0                   0                 0.0
##   AVG_AMT_PER_MOB_TXN FLG_HAS_NOMINEE FLG_HAS_OLD_LOAN      random
## 1                   0               1                1 0.000011400
## 2                   0               1                0 0.000111373
## 3                   0               1                1 0.000119954
## 4               71388               1                0 0.000136825
## 5                   0               1                0 0.000173976
## 6                   0               1                1 0.000405840
# High-level, per-column summary of the dataset
summary(object = raw_data)
##     CUST_ID          TARGET            AGE        GENDER   
##  C1     :    1   Min.   :0.0000   Min.   :21.00   F: 5433  
##  C10    :    1   1st Qu.:0.0000   1st Qu.:30.00   M:14376  
##  C100   :    1   Median :0.0000   Median :38.00   O:  191  
##  C1000  :    1   Mean   :0.1256   Mean   :38.42            
##  C10000 :    1   3rd Qu.:0.0000   3rd Qu.:46.00            
##  C10001 :    1   Max.   :1.0000   Max.   :55.00            
##  (Other):19994                                             
##     BALANCE           OCCUPATION    AGE_BKT          SCR       
##  Min.   :      0   PROF    :5417   <25  :1753   Min.   :100.0  
##  1st Qu.:  64754   SAL     :5855   26-30:3434   1st Qu.:227.0  
##  Median : 231676   SELF-EMP:3568   31-35:3404   Median :364.0  
##  Mean   : 511362   SENP    :5160   36-40:2814   Mean   :440.2  
##  3rd Qu.: 653877                   41-45:3067   3rd Qu.:644.0  
##  Max.   :8360431                   46-50:2493   Max.   :999.0  
##                                    >50  :3035                  
##  HOLDING_PERIOD  ACC_TYPE       ACC_OP_DATE    LEN_OF_RLTN_IN_MNTH
##  Min.   : 1.00   CA: 4241   11/16/2010:   24   Min.   : 29.0      
##  1st Qu.: 7.00   SA:15759   04-03-09  :   23   1st Qu.: 79.0      
##  Median :15.00              7/25/2010 :   22   Median :125.0      
##  Mean   :14.96              05-06-13  :   21   Mean   :125.2      
##  3rd Qu.:22.00              02-07-07  :   20   3rd Qu.:172.0      
##  Max.   :31.00              8/24/2010 :   20   Max.   :221.0      
##                             (Other)   :19870                      
##  NO_OF_L_CR_TXNS NO_OF_L_DR_TXNS  TOT_NO_OF_L_TXNS
##  Min.   : 0.00   Min.   : 0.000   Min.   :  0.00  
##  1st Qu.: 6.00   1st Qu.: 2.000   1st Qu.:  9.00  
##  Median :10.00   Median : 5.000   Median : 14.00  
##  Mean   :12.35   Mean   : 6.634   Mean   : 18.98  
##  3rd Qu.:14.00   3rd Qu.: 7.000   3rd Qu.: 21.00  
##  Max.   :75.00   Max.   :74.000   Max.   :149.00  
##                                                   
##  NO_OF_BR_CSH_WDL_DR_TXNS NO_OF_ATM_DR_TXNS NO_OF_NET_DR_TXNS
##  Min.   : 0.000           Min.   : 0.000    Min.   : 0.000   
##  1st Qu.: 1.000           1st Qu.: 0.000    1st Qu.: 0.000   
##  Median : 1.000           Median : 1.000    Median : 0.000   
##  Mean   : 1.883           Mean   : 1.029    Mean   : 1.172   
##  3rd Qu.: 2.000           3rd Qu.: 1.000    3rd Qu.: 1.000   
##  Max.   :15.000           Max.   :25.000    Max.   :22.000   
##                                                              
##  NO_OF_MOB_DR_TXNS NO_OF_CHQ_DR_TXNS   FLG_HAS_CC       AMT_ATM_DR    
##  Min.   : 0.0000   Min.   : 0.000    Min.   :0.0000   Min.   :     0  
##  1st Qu.: 0.0000   1st Qu.: 0.000    1st Qu.:0.0000   1st Qu.:     0  
##  Median : 0.0000   Median : 2.000    Median :0.0000   Median :  6900  
##  Mean   : 0.4118   Mean   : 2.138    Mean   :0.3054   Mean   : 10990  
##  3rd Qu.: 0.0000   3rd Qu.: 4.000    3rd Qu.:1.0000   3rd Qu.: 15800  
##  Max.   :25.0000   Max.   :15.000    Max.   :1.0000   Max.   :199300  
##                                                                       
##  AMT_BR_CSH_WDL_DR   AMT_CHQ_DR        AMT_NET_DR       AMT_MOB_DR    
##  Min.   :     0    Min.   :      0   Min.   :     0   Min.   :     0  
##  1st Qu.:  2990    1st Qu.:      0   1st Qu.:     0   1st Qu.:     0  
##  Median :340150    Median :  23840   Median :     0   Median :     0  
##  Mean   :378474    Mean   : 124520   Mean   :237308   Mean   : 22425  
##  3rd Qu.:674675    3rd Qu.:  72470   3rd Qu.:473970   3rd Qu.:     0  
##  Max.   :999930    Max.   :4928640   Max.   :999854   Max.   :199667  
##                                                                       
##     AMT_L_DR       FLG_HAS_ANY_CHGS AMT_OTH_BK_ATM_USG_CHGS
##  Min.   :      0   Min.   :0.0000   Min.   :  0.000        
##  1st Qu.: 237936   1st Qu.:0.0000   1st Qu.:  0.000        
##  Median : 695115   Median :0.0000   Median :  0.000        
##  Mean   : 773717   Mean   :0.1106   Mean   :  1.099        
##  3rd Qu.:1078927   3rd Qu.:0.0000   3rd Qu.:  0.000        
##  Max.   :6514921   Max.   :1.0000   Max.   :250.000        
##                                                            
##  AMT_MIN_BAL_NMC_CHGS NO_OF_IW_CHQ_BNC_TXNS NO_OF_OW_CHQ_BNC_TXNS
##  Min.   :  0.000      Min.   :0.00000       Min.   :0.0000       
##  1st Qu.:  0.000      1st Qu.:0.00000       1st Qu.:0.0000       
##  Median :  0.000      Median :0.00000       Median :0.0000       
##  Mean   :  1.292      Mean   :0.04275       Mean   :0.0444       
##  3rd Qu.:  0.000      3rd Qu.:0.00000       3rd Qu.:0.0000       
##  Max.   :170.000      Max.   :2.00000       Max.   :2.0000       
##                                                                  
##  AVG_AMT_PER_ATM_TXN AVG_AMT_PER_CSH_WDL_TXN AVG_AMT_PER_CHQ_TXN
##  Min.   :    0       Min.   :     0          Min.   :     0     
##  1st Qu.:    0       1st Qu.:  1266          1st Qu.:     0     
##  Median : 6000       Median :147095          Median :  8645     
##  Mean   : 7409       Mean   :242236          Mean   : 25092     
##  3rd Qu.:13500       3rd Qu.:385000          3rd Qu.: 28605     
##  Max.   :25000       Max.   :999640          Max.   :537842     
##                                                                 
##  AVG_AMT_PER_NET_TXN AVG_AMT_PER_MOB_TXN FLG_HAS_NOMINEE  FLG_HAS_OLD_LOAN
##  Min.   :     0      Min.   :     0      Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:     0      1st Qu.:     0      1st Qu.:1.0000   1st Qu.:0.0000  
##  Median :     0      Median :     0      Median :1.0000   Median :0.0000  
##  Mean   :179059      Mean   : 20304      Mean   :0.9012   Mean   :0.4929  
##  3rd Qu.:257699      3rd Qu.:     0      3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :999854      Max.   :199667      Max.   :1.0000   Max.   :1.0000  
##                                                                           
##      random         
##  Min.   :0.0000114  
##  1st Qu.:0.2481866  
##  Median :0.5061214  
##  Mean   :0.5019330  
##  3rd Qu.:0.7535712  
##  Max.   :0.9999471  
## 

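The dataset has no missing values (the summary above reports no NA counts). Note, however, that ACC_OP_DATE mixes date formats (e.g. 3/23/2005 and 10-11-08) and would need to be parsed consistently before it could be used.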

Exploring the data

# Attach DataExplorer with library() rather than require(): library() stops
# with an error when the package is not installed, whereas require() only
# returns FALSE and lets later steps fail in a less obvious place.
library(DataExplorer)
Data Profiling Report

Basic Statistics

Raw Counts

Name Value
Rows 20,000
Columns 40
Discrete columns 6
Continuous columns 34
All missing columns 0
Missing observations 0
Complete Rows 20,000
Total observations 800,000
Memory allocation 5.3 Mb

Percentages

Data Structure

Missing Data Profile

Univariate Distribution

Histogram

Bar Chart (by frequency)

## 2 columns ignored with more than 50 categories.
## CUST_ID: 20000 categories
## ACC_OP_DATE: 4869 categories

QQ Plot

Correlation Analysis

## 2 features with more than 20 categories ignored!
## CUST_ID: 20000 categories
## ACC_OP_DATE: 4869 categories

Principal Component Analysis

## 2 features with more than 50 categories ignored!
## CUST_ID: 20000 categories
## ACC_OP_DATE: 4869 categories

# Drop CUST_ID: it is a per-customer identifier with 20000 distinct values
# across 20000 rows, so it is not useful as a predictor.
# Standard column indexing is used instead of subset(), whose non-standard
# evaluation is intended for interactive use; drop = FALSE guarantees the
# result stays a data frame.
raw_data <- raw_data[, names(raw_data) != "CUST_ID", drop = FALSE]

Check for zero and near-zero variance

library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
# Compute caret's zero / near-zero variance metrics for every column
# (saveMetrics = TRUE returns the per-column metrics table rather than only
# the indices of the flagged columns).
nzv <- nearZeroVar(raw_data, saveMetrics = TRUE)
# Copy the row names into an ordinary column so the variable name travels
# with the metrics.
nzv <- cbind(row_name = rownames(nzv), nzv)
# No column has zero variance, but six are flagged as near-zero variance
# (nzv = TRUE): AMT_ATM_DR, AMT_OTH_BK_ATM_USG_CHGS, AMT_MIN_BAL_NMC_CHGS,
# NO_OF_IW_CHQ_BNC_TXNS, NO_OF_OW_CHQ_BNC_TXNS and AVG_AMT_PER_ATM_TXN.
print(nzv)
##                                          row_name   freqRatio
## TARGET                                     TARGET    6.961783
## AGE                                           AGE    1.073171
## GENDER                                     GENDER    2.646052
## BALANCE                                   BALANCE    2.111111
## OCCUPATION                             OCCUPATION    1.080857
## AGE_BKT                                   AGE_BKT    1.008813
## SCR                                           SCR    1.044776
## HOLDING_PERIOD                     HOLDING_PERIOD    1.336165
## ACC_TYPE                                 ACC_TYPE    3.715869
## ACC_OP_DATE                           ACC_OP_DATE    1.043478
## LEN_OF_RLTN_IN_MNTH           LEN_OF_RLTN_IN_MNTH    1.025316
## NO_OF_L_CR_TXNS                   NO_OF_L_CR_TXNS    1.553664
## NO_OF_L_DR_TXNS                   NO_OF_L_DR_TXNS    1.109273
## TOT_NO_OF_L_TXNS                 TOT_NO_OF_L_TXNS    1.189585
## NO_OF_BR_CSH_WDL_DR_TXNS NO_OF_BR_CSH_WDL_DR_TXNS    1.347703
## NO_OF_ATM_DR_TXNS               NO_OF_ATM_DR_TXNS    1.573045
## NO_OF_NET_DR_TXNS               NO_OF_NET_DR_TXNS    1.635942
## NO_OF_MOB_DR_TXNS               NO_OF_MOB_DR_TXNS    4.021603
## NO_OF_CHQ_DR_TXNS               NO_OF_CHQ_DR_TXNS    1.682764
## FLG_HAS_CC                             FLG_HAS_CC    2.274394
## AMT_ATM_DR                             AMT_ATM_DR   66.094737
## AMT_BR_CSH_WDL_DR               AMT_BR_CSH_WDL_DR  331.533333
## AMT_CHQ_DR                             AMT_CHQ_DR  405.142857
## AMT_NET_DR                             AMT_NET_DR  813.692308
## AMT_MOB_DR                             AMT_MOB_DR 1545.100000
## AMT_L_DR                                 AMT_L_DR   57.692308
## FLG_HAS_ANY_CHGS                 FLG_HAS_ANY_CHGS    8.041591
## AMT_OTH_BK_ATM_USG_CHGS   AMT_OTH_BK_ATM_USG_CHGS  451.340909
## AMT_MIN_BAL_NMC_CHGS         AMT_MIN_BAL_NMC_CHGS  130.578947
## NO_OF_IW_CHQ_BNC_TXNS       NO_OF_IW_CHQ_BNC_TXNS   22.553592
## NO_OF_OW_CHQ_BNC_TXNS       NO_OF_OW_CHQ_BNC_TXNS   21.572235
## AVG_AMT_PER_ATM_TXN           AVG_AMT_PER_ATM_TXN   62.168317
## AVG_AMT_PER_CSH_WDL_TXN   AVG_AMT_PER_CSH_WDL_TXN  452.090909
## AVG_AMT_PER_CHQ_TXN           AVG_AMT_PER_CHQ_TXN  425.400000
## AVG_AMT_PER_NET_TXN           AVG_AMT_PER_NET_TXN  813.692308
## AVG_AMT_PER_MOB_TXN           AVG_AMT_PER_MOB_TXN 1545.100000
## FLG_HAS_NOMINEE                   FLG_HAS_NOMINEE    9.116338
## FLG_HAS_OLD_LOAN                 FLG_HAS_OLD_LOAN    1.028603
## random                                     random    1.000000
##                          percentUnique zeroVar   nzv
## TARGET                           0.010   FALSE FALSE
## AGE                              0.175   FALSE FALSE
## GENDER                           0.015   FALSE FALSE
## BALANCE                         49.620   FALSE FALSE
## OCCUPATION                       0.020   FALSE FALSE
## AGE_BKT                          0.035   FALSE FALSE
## SCR                              4.480   FALSE FALSE
## HOLDING_PERIOD                   0.155   FALSE FALSE
## ACC_TYPE                         0.010   FALSE FALSE
## ACC_OP_DATE                     24.345   FALSE FALSE
## LEN_OF_RLTN_IN_MNTH              0.965   FALSE FALSE
## NO_OF_L_CR_TXNS                  0.380   FALSE FALSE
## NO_OF_L_DR_TXNS                  0.240   FALSE FALSE
## TOT_NO_OF_L_TXNS                 0.490   FALSE FALSE
## NO_OF_BR_CSH_WDL_DR_TXNS         0.080   FALSE FALSE
## NO_OF_ATM_DR_TXNS                0.130   FALSE FALSE
## NO_OF_NET_DR_TXNS                0.085   FALSE FALSE
## NO_OF_MOB_DR_TXNS                0.030   FALSE FALSE
## NO_OF_CHQ_DR_TXNS                0.080   FALSE FALSE
## FLG_HAS_CC                       0.010   FALSE FALSE
## AMT_ATM_DR                       3.230   FALSE  TRUE
## AMT_BR_CSH_WDL_DR               36.845   FALSE FALSE
## AMT_CHQ_DR                      24.455   FALSE FALSE
## AMT_NET_DR                      24.235   FALSE FALSE
## AMT_MOB_DR                      11.560   FALSE FALSE
## AMT_L_DR                        47.980   FALSE FALSE
## FLG_HAS_ANY_CHGS                 0.010   FALSE FALSE
## AMT_OTH_BK_ATM_USG_CHGS          0.030   FALSE  TRUE
## AMT_MIN_BAL_NMC_CHGS             0.010   FALSE  TRUE
## NO_OF_IW_CHQ_BNC_TXNS            0.015   FALSE  TRUE
## NO_OF_OW_CHQ_BNC_TXNS            0.015   FALSE  TRUE
## AVG_AMT_PER_ATM_TXN              3.760   FALSE  TRUE
## AVG_AMT_PER_CSH_WDL_TXN         37.570   FALSE FALSE
## AVG_AMT_PER_CHQ_TXN             26.730   FALSE FALSE
## AVG_AMT_PER_NET_TXN             24.285   FALSE FALSE
## AVG_AMT_PER_MOB_TXN             11.580   FALSE FALSE
## FLG_HAS_NOMINEE                  0.010   FALSE FALSE
## FLG_HAS_OLD_LOAN                 0.010   FALSE FALSE
## random                         100.000   FALSE FALSE